For some types of analysis, using empirical samples won't provide enough variation to discover all the edge cases. So we want to create a file that contains a huge number of different variant types.
For this bit of data munging, we'll be using a few different Python libraries
VCF would be nice in the end, so we will construct our dataframe to closely model the VCF spec.
For each position we want to represent:
#CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO |
7 | 117144307 | . | C | G | . | PASS | DP=100 |
7 | 117144307 | . | C | T | . | PASS | DP=100 |
7 | 117144307 | . | C | A | . | PASS | DP=100 |
7 | 117144307 | . | C | CA | . | PASS | DP=100 |
7 | 117144307 | . | C | CG | . | PASS | DP=100 |
7 | 117144307 | . | C | CC | . | PASS | DP=100 |
7 | 117144307 | . | C | CT | . | PASS | DP=100 |
7 | 117144307 | . | C | CAG | . | PASS | DP=100 |
7 | 117144307 | . | C | CTG | . | PASS | DP=100 |
7 | 117144307 | . | C | CCAT | . | PASS | DP=100 |
7 | 117144307 | . | C | CAGT | . | PASS | DP=100 |
7 | 117144307 | . | CT | C | . | PASS | DP=100 |
7 | 117144307 | . | CTG | C | . | PASS | DP=100 |
7 | 117144307 | . | CTGG | C | . | PASS | DP=100 |
#Import our libs up front
import itertools
import random
import os
import datetime
import pandas as pd
import numpy as np
from HTSeq import FastaReader
First, we just need to get the table from Genome Browse into Python. Since we've already imported pandas, we can use its parsing tools. But we won't really use pandas the way it is intended to be used until later.
# Parse the Genome Browse export into {transcript name: [(exon start, exon stop), ...]}.
# This DataFrame is tiny, so a plain Python dict is more convenient than pandas
# indexing for the interval munging that follows.
import_df = pd.read_csv("./EnsamblCFTR.tsv", sep="\t")
tx_exon_dict = dict()
for _, row in import_df.iterrows():
    starts = map(int, row["Exon Starts"].split(","))
    stops = map(int, row["Exon Stops"].split(","))
    # list() materializes the pairs: on Python 3, zip() is a one-shot iterator
    # and storing it unmaterialized would leave an exhaustible object in the dict
    tx_exon_dict[row["Transcript Name"]] = list(zip(starts, stops))
Now we want to expand the start and stop by 100bp to make sure that we capture any variation caused by mutations in the immediate surrounding area.
# Pad every exon interval by 100bp on each side so that variation in the
# immediately flanking sequence is also captured.
expanded_tx_exon_dict = {
    tx_name: [(start - 100, stop + 100) for start, stop in coords]
    for tx_name, coords in tx_exon_dict.items()
}
We really don't care about the transcript names. So the first thing we'll do is collapse our dict into an interval list.
# Transcript names no longer matter: flatten the per-transcript exon lists into
# one flat list of (start, stop) intervals.
# .values() replaces the Python 2-only .itervalues() (AttributeError on Python 3),
# and chain.from_iterable replaces the list-comprehension-for-side-effects antipattern.
interval_list = list(itertools.chain.from_iterable(expanded_tx_exon_dict.values()))
print(interval_list)
[(117119916, 117120301), (117144206, 117144517), (117148987, 117149296), (117170852, 117171268), (117174229, 117174519), (117175201, 117175565), (117176501, 117176827), (117180053, 117180500), (117181969, 117182262), (117199417, 117199809), (117227692, 117227987), (117230306, 117230593), (117231887, 117232811), (117234883, 117235212), (117242779, 117243017), (117243485, 117243936), (117246627, 117246907), (117250472, 117250823), (117251534, 117251962), (117254566, 117254867), (117267475, 117267924), (117282391, 117282747), (117292795, 117293085), (117304641, 117305014), (117305412, 117305718), (117306861, 117308819), (117329666, 117330075), (117355711, 117356125), (117292796, 117293085), (117304641, 117305014), (117305412, 117305718), (117355711, 117356102), (117349970, 117350855), (117354124, 117354419), (117355711, 117356027), (117251570, 117251962), (117254566, 117254867), (117267475, 117268064), (117105737, 117105985), (117115644, 117115962), (117144206, 117144462), (117119916, 117120301), (117144206, 117144517), (117148987, 117149296), (117170852, 117171268), (117174229, 117174519), (117175201, 117175565), (117176501, 117176827), (117180053, 117180500), (117181969, 117182262), (117188594, 117188977), (117199417, 117199809), (117227692, 117227987), (117230306, 117230593), (117231887, 117232811), (117234883, 117235212), (117242779, 117243017), (117243485, 117243936), (117246627, 117246907), (117250472, 117250823), (117251534, 117251962), (117254566, 117254867), (117267475, 117267924), (117282391, 117282747), (117292795, 117293085), (117304641, 117305014), (117305412, 117305718), (117306861, 117308815), (117199591, 117199809), (117204622, 117204941), (117227692, 117227903), (117350085, 117350484), (117350564, 117350855), (117352294, 117352558), (117354124, 117354419), (117355711, 117355981), (117119257, 117119499), (117119415, 117119848), (117144206, 117144517), (117148987, 117149296), (117170852, 117171132), (117120048, 117120301), (117144206, 117144517), 
(117148987, 117149296), (117170852, 117171268), (117175201, 117175565), (117176501, 117176827), (117180053, 117180500), (117181969, 117182262), (117188594, 117188977), (117199417, 117199809), (117227692, 117227987), (117230306, 117230593), (117231887, 117232811), (117234883, 117235212), (117242779, 117243017), (117243485, 117243936), (117246627, 117246907), (117250472, 117250823), (117251534, 117251962), (117254566, 117254867), (117267475, 117267924), (117282391, 117282747), (117292795, 117293085), (117304641, 117305014), (117305412, 117305718), (117306861, 117307225)]
Up to this point, we haven't done anything too interesting. Well, put on your algorithm helmet and get ready!
What we want is the minimum set of intervals that contains all the intervals in our raw list.
# Order intervals by start coordinate so that overlapping ones become adjacent.
sorted_by_start = sorted(interval_list, key=lambda interval: interval[0])
def collapse_intervals(intervals):
    """Yield the minimal set of merged intervals covering every input interval.

    `intervals` must be a sequence of (start, stop) pairs sorted by start.
    Overlapping and touching (stop == next start) intervals are merged.
    Yields nothing for empty input.
    """
    if not intervals:
        return  # empty input used to raise IndexError on intervals[0]
    cur_start, cur_stop = intervals[0]
    for next_start, next_stop in intervals[1:]:
        if cur_stop < next_start:
            # Disjoint: emit the finished interval and start a new one.
            yield cur_start, cur_stop
            cur_start, cur_stop = next_start, next_stop
        else:
            # Overlapping/touching: extend the current interval. max() guards
            # against an interval fully contained in the current one — the
            # original unconditionally took next_stop, which could wrongly
            # shrink the merged interval.
            cur_stop = max(cur_stop, next_stop)
    yield cur_start, cur_stop
Even with something pretty straightforward, it's good to make sure that the function works as you'd expect.
# Sanity-check collapse_intervals against hand-computed minimal sets.
test_intervals = [(1, 4), (2, 5), (5, 7), (10, 13)]  # min set [(1,7), (10,13)]
test_intervals2 = [(1, 4), (2, 5), (6, 7), (10, 13), (12, 15)]  # min set [(1,5), (6,7), (10,15)]
expected = [(1, 7), (10, 13)]
expected2 = [(1, 5), (6, 7), (10, 15)]
print(list(collapse_intervals(test_intervals)) == expected)
print(list(collapse_intervals(test_intervals2)) == expected2)
True True
# Materialize the merged interval set and report how much collapsing helped.
collapsed_interval_list = list(collapse_intervals(sorted_by_start))
n_before = len(interval_list)
n_after = len(collapsed_interval_list)
print("Before collapsing we have: " + str(n_before) + " intervals")
print("After collapsing we have: " + str(n_after) + " intervals")
print("\nOur new interval set:\n")
print(collapsed_interval_list)
Before collapsing we have: 107 intervals After collapsing we have: 37 intervals Our new interval set: [(117105737, 117105985), (117115644, 117115962), (117119257, 117119848), (117119916, 117120301), (117144206, 117144517), (117148987, 117149296), (117170852, 117171268), (117174229, 117174519), (117175201, 117175565), (117176501, 117176827), (117180053, 117180500), (117181969, 117182262), (117188594, 117188977), (117199417, 117199809), (117204622, 117204941), (117227692, 117227987), (117230306, 117230593), (117231887, 117232811), (117234883, 117235212), (117242779, 117243017), (117243485, 117243936), (117246627, 117246907), (117250472, 117250823), (117251534, 117251962), (117254566, 117254867), (117267475, 117267924), (117282391, 117282747), (117292795, 117293085), (117304641, 117305014), (117305412, 117305718), (117306861, 117307225), (117329666, 117330075), (117349970, 117350484), (117350564, 117350855), (117352294, 117352558), (117354124, 117354419), (117355711, 117355981)]
We need to have quick access to possible variants. We will be creating all SNPs and 1bp insertions. To keep the dataset manageable we'll only use 2 of the 2bp and 2 of the 3bp options.
# Allele alphabets used to synthesize variants. permutations (not combinations)
# because insertion order matters: e.g. 'AG' and 'GA' are distinct alleles.
BASES = ['A', 'G', 'C', 'T']
TWOBASES = [''.join(combo) for combo in itertools.permutations(BASES, 2)]
THREEBASES = [''.join(combo) for combo in itertools.permutations(BASES, 3)]
# print() calls replace the Python 2 print statements, which are a
# syntax error on Python 3.
print(BASES)
print(TWOBASES)
print(THREEBASES)
['A', 'G', 'C', 'T'] ['AG', 'AC', 'AT', 'GA', 'GC', 'GT', 'CA', 'CG', 'CT', 'TA', 'TG', 'TC'] ['AGC', 'AGT', 'ACG', 'ACT', 'ATG', 'ATC', 'GAC', 'GAT', 'GCA', 'GCT', 'GTA', 'GTC', 'CAG', 'CAT', 'CGA', 'CGT', 'CTA', 'CTG', 'TAG', 'TAC', 'TGA', 'TGC', 'TCA', 'TCG']
To keep things organized we'll create a function that returns a list of the variants we want to seed at that position. We'll create all the SNPs and the insertions following the current base we are working on.
The fields we want in each row are:
Since ID, QUAL, FILTER, and INFO don't really exist as these variants didn't come from real sequence data, we'll just allow them to be missing and set the missing value on export. We can hard code in our FORMAT and SAMPLE column to contain the genotype information for this sample.
In order to get the reference allele at each position, we downloaded the hg19 chr7 fasta file from http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr7.fa.gz. You could easily do some simple file and string operations to get the reference from the position out of this file, but I chose to use the HTSeq library, so that I can pass the gzip file in directly and not deal with iterating over lines.
# Treat the FASTA reader as an iterator and take its first (only) record: chr7.
# next(iter(...)) replaces the Python 2-only .__iter__().next() idiom, which
# raises AttributeError on Python 3.
chr7_seq = next(iter(FastaReader("./chr7.fa.gz")))
def get_variants(chr, pos):
    """Return dict rows for all variants (SNPs, insertions, deletions) seeded at pos."""
    # pos is 1-based while Python slicing is 0-based and exclusive of the upper
    # index, so this grabs the 4 reference bases starting at pos — enough for
    # the longest (3bp) deletion.
    ref = chr7_seq[pos - 1:pos + 3].seq.upper()
    return get_snps(chr, pos, ref) + get_insertions(chr, pos, ref) + get_deletions(chr, pos, ref)
def get_snps(chr, pos, ref):
    """Return one row per substitution of the reference base at pos (3 SNPs)."""
    ref_base = ref[0]
    return [
        {'#CHROM': chr, 'POS': pos, 'REF': ref_base, 'ALT': alt}
        for alt in BASES
        if alt != ref_base
    ]
def get_insertions(chr, pos, ref):
    """Return rows for insertions after pos: all four 1bp inserts plus
    2 randomly sampled 2bp and 2 randomly sampled 3bp inserts."""
    anchor = ref[0]
    inserted_alleles = list(BASES) + random.sample(TWOBASES, 2) + random.sample(THREEBASES, 2)
    # VCF convention: REF is the anchor base, ALT is anchor + inserted sequence.
    return [
        {'#CHROM': chr, 'POS': pos, 'REF': anchor, 'ALT': anchor + allele}
        for allele in inserted_alleles
    ]
def get_deletions(chr, pos, ref):
    """Return rows deleting 1, 2, and 3 bases after pos.

    VCF convention: REF spans the anchor base plus the deleted bases,
    ALT is just the anchor base.
    """
    anchor = ref[0]
    return [
        {'#CHROM': chr, 'POS': pos, 'REF': ref[:span], 'ALT': anchor}
        for span in (2, 3, len(ref))
    ]
col_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

# Collect every row dict first and build the DataFrame once at the end:
# DataFrame.append inside a loop copies the whole frame on every iteration
# (quadratic time) and was removed entirely in pandas 2.0.
all_variants = []
for start, stop in collapsed_interval_list:
    for pos in range(start, stop):
        all_variants.extend(get_variants(7, pos))

# Columns absent from the row dicts (ID, QUAL, FILTER, INFO) come out as NaN,
# matching the old append() behaviour; they are rendered as '.' on export.
variant_df = pd.DataFrame(all_variants, columns=col_names)

# hard code in our genotype information
variant_df['FORMAT'] = 'GT'
variant_df['SAMPLE'] = '0/1'
Empty DataFrame Columns: [#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLE] Index: []
variant_df.head()
#CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | SAMPLE | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 7 | 117105737 | NaN | C | A | NaN | NaN | NaN | GT | 0/1 |
1 | 7 | 117105737 | NaN | C | G | NaN | NaN | NaN | GT | 0/1 |
2 | 7 | 117105737 | NaN | C | T | NaN | NaN | NaN | GT | 0/1 |
3 | 7 | 117105737 | NaN | C | CA | NaN | NaN | NaN | GT | 0/1 |
4 | 7 | 117105737 | NaN | C | CG | NaN | NaN | NaN | GT | 0/1 |
Now we'd like to create a VCF from our dataframe. Since this is a one off vcf file, we'll just manually construct a header and then append our column headers and dataframe values.
# Minimal VCF header. Only the GT FORMAT field is declared because genotype
# is the only per-sample datum we emit; {0} is filled with today's UTC date.
vcf_header = (
    "##fileformat=VCFv4.1\n"
    "##fileDate={0}\n"
    "##source=CFTR_Variant_Project\n"
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
)
vcf_header = vcf_header.format(datetime.datetime.utcnow().strftime("%Y%m%d"))
# Preview the header plus the column line as they will appear in the VCF.
preview_parts = [vcf_header, '\t'.join(variant_df.columns), '\t'.join(col_names)]
'\n'.join(preview_parts)
'##fileformat=VCFv4.1\n##fileDate=20140501\n##source=CFTR_Variant_Project\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE'
output_file = "CFTR_artifical_variant.vcf"
# Opening in 'w' mode truncates any existing file, replacing the original
# remove-then-reopen-in-append-mode dance (which was racy and needed an
# OSError guard for the file-not-found case).
with open(output_file, 'w') as f:
    f.write(vcf_header)
    # na_rep='.' renders the intentionally-missing ID/QUAL/FILTER/INFO
    # columns with the VCF missing-value marker.
    variant_df.to_csv(f, sep='\t', na_rep='.', index=False)
We might want to come back and play with our dataframe without having to regenerate all this data. HDF5 is a great way to store Pandas dataframes.
store_file = "store.h5"
# Start from a clean slate; ignore the error if the file doesn't exist yet.
try:
    os.remove(store_file)
except OSError:
    pass
# Context manager guarantees the HDF5 file handle is flushed and closed —
# the original left the store open for the life of the process.
with pd.HDFStore(store_file) as store:
    store['variant_df'] = variant_df
If we import this VCF into Genome Browse, we can see our work. First, notice in the zoomed-out view that our variants have covered all of the exonic regions in all the transcripts in the Ensembl database for CFTR. Second, upon zooming in we can see our deletions of various sizes in pink, our SNPs in the middle multi-color blocks, and our insertions in the small orange bars in the lower part of the view.